{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "47f8d2d1-6e1d-405e-b9dd-fa3be6c84ec6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "\n",
    "import nltk\n",
    "import pandas as pd\n",
    "from nltk.corpus import stopwords\n",
    "\n",
    "# Base English stop words from NLTK. On a fresh environment the corpus is not\n",
    "# installed yet and stopwords.words() raises LookupError, so fetch it once.\n",
    "try:\n",
    "    stop_words = set(stopwords.words('english'))\n",
    "except LookupError:\n",
    "    nltk.download('stopwords')\n",
    "    stop_words = set(stopwords.words('english'))\n",
    "\n",
    "# Project-specific stop words: one entry per line, trimmed and lower-cased.\n",
    "with open('stopwords.txt', 'r', encoding='utf-8') as f:\n",
    "    custom_stopwords = {w.strip().lower() for w in f}\n",
    "stop_words = stop_words.union(custom_stopwords)\n",
    "\n",
    "liwc_dict_path = r\"LIWC2015 Dictionary.dic\"\n",
    "liwc_prepro_words = set()\n",
    "\n",
    "with open(liwc_dict_path, 'r', encoding='utf-8', errors='ignore') as f:\n",
    "    lines = f.readlines()\n",
    "\n",
    "# The category header is read from the lines that sit between '%' delimiter\n",
    "# lines. Looking up only the first '%' returns index 0 when the file opens\n",
    "# with one, which leaves the header slice empty, so locate every delimiter\n",
    "# and treat the second one as the end of the header.\n",
    "delimiter_rows = [i for i, line in enumerate(lines) if line.strip() == '%']\n",
    "if not delimiter_rows:\n",
    "    raise ValueError('no % delimiter line found in ' + liwc_dict_path)\n",
    "if len(delimiter_rows) >= 2:\n",
    "    header_start, sep_index = delimiter_rows[0] + 1, delimiter_rows[1]\n",
    "else:\n",
    "    # Only one delimiter is recognisable (for example when the opening one is\n",
    "    # prefixed by a byte-order mark): keep the earlier behaviour of skipping\n",
    "    # line 0 and reading the header up to that delimiter.\n",
    "    header_start, sep_index = 1, delimiter_rows[0]\n",
    "\n",
    "# Category id -> lower-cased category name.\n",
    "category_map = {}\n",
    "for line in lines[header_start:sep_index]:\n",
    "    parts = line.strip().split('\\t')\n",
    "    if len(parts) >= 2:\n",
    "        cat_id, cat_name = parts[0], parts[1].lower()\n",
    "        category_map[cat_id] = cat_name\n",
    "\n",
    "# Ids of every category whose name contains 'pron' or 'prep'.\n",
    "target_cats = {cid for cid, cname in category_map.items()\n",
    "               if \"pron\" in cname or \"prep\" in cname}\n",
    "\n",
    "# Collect the dictionary entries tagged with at least one of those categories.\n",
    "for line in lines[sep_index+1:]:\n",
    "    parts = line.strip().split('\\t')\n",
    "    if len(parts) >= 2:\n",
    "        word = parts[0].lower().strip()\n",
    "        cats = parts[1:]\n",
    "        if any(cat in target_cats for cat in cats):\n",
    "            liwc_prepro_words.add(word.replace('*', ''))  # drop the wildcard marker\n",
    "stop_words = stop_words.union(liwc_prepro_words)\n",
    "\n",
    "\n",
    "def process_df(df):\n",
    "    \"\"\"Return a cleaned copy of a feature table.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Must provide a 'Feature' column and a 'Beta' column comparable with 0.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        New frame in which 'Feature' is cast to str, stripped of every character\n",
    "        that is not a word character, whitespace or '-', and lower-cased. Rows\n",
    "        whose feature is in the module-level stop_words set or whose Beta is\n",
    "        not greater than 0 are removed and the index is reset. The frame\n",
    "        passed in is left unmodified.\n",
    "    \"\"\"\n",
    "    cleaned = df['Feature'].astype(str).apply(\n",
    "        lambda x: re.sub(r'[^\\w\\s-]', '', x).lower()\n",
    "    )\n",
    "    # assign() builds a new frame; writing df['Feature'] = ... would overwrite\n",
    "    # the caller's raw table as a side effect.\n",
    "    out = df.assign(Feature=cleaned)\n",
    "    keep = ~out['Feature'].isin(stop_words) & (out['Beta'] > 0)\n",
    "    return out[keep].reset_index(drop=True)\n",
    "\n",
    "# Corpus exported by the R text-cleaning / SDA script (the original notes refer\n",
    "# to it as textcleaning_dand_SDA.R and textcleaning_1and_SDA.R).\n",
    "corpus = pd.read_csv('manifesto_corpus.csv')\n",
    "\n",
    "# SDA shrinkage results, one table per theme, produced by the same R program.\n",
    "authoritarianism, morality, life, eco = (\n",
    "    pd.read_csv(beta_file)\n",
    "    for beta_file in (\n",
    "        'authoritarianism_shrinkage_beta.csv',\n",
    "        'morality_shrinkage_beta.csv',\n",
    "        'life_shrinkage_beta.csv',\n",
    "        'eco_shrinkage_beta.csv',\n",
    "    )\n",
    ")\n",
    "\n",
    "# Clean each table with process_df.\n",
    "authoritarianism_0, morality_0, life_0, eco_0 = (\n",
    "    process_df(raw_table)\n",
    "    for raw_table in (authoritarianism, morality, life, eco)\n",
    ")\n",
    "\n",
    "# Persist the cleaned tables next to the notebook.\n",
    "for cleaned_table, out_file in (\n",
    "    (authoritarianism_0, \"authoritarianism_0.csv\"),\n",
    "    (morality_0, \"morality_0.csv\"),\n",
    "    (life_0, \"life_0.csv\"),\n",
    "    (eco_0, \"eco_0.csv\"),\n",
    "):\n",
    "    cleaned_table.to_csv(out_file, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f5a9bbf-8820-42bb-99a1-4fc49cf290b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tag every cleaned table with the theme it came from.\n",
    "for themed_table, source_label in (\n",
    "    (authoritarianism_0, 'authoritarianism'),\n",
    "    (morality_0, 'morality'),\n",
    "    (life_0, 'life'),\n",
    "    (eco_0, 'eco'),\n",
    "):\n",
    "    themed_table['Source'] = source_label\n",
    "\n",
    "combined = pd.concat(\n",
    "    [authoritarianism_0, morality_0, life_0, eco_0],\n",
    "    ignore_index=True,\n",
    ")\n",
    "\n",
    "# A feature can appear under several themes: keep only the row with the\n",
    "# largest Beta, so every feature ends up assigned to a single theme.\n",
    "strongest_rows = combined.groupby('Feature')['Beta'].idxmax()\n",
    "filtered = combined.loc[strongest_rows].reset_index(drop=True)\n",
    "\n",
    "# Split the de-duplicated table back into one frame per theme.\n",
    "source_column = filtered['Source']\n",
    "authoritarianism_final = filtered[source_column == 'authoritarianism'].reset_index(drop=True)\n",
    "morality_final = filtered[source_column == 'morality'].reset_index(drop=True)\n",
    "life_final = filtered[source_column == 'life'].reset_index(drop=True)\n",
    "eco_final = filtered[source_column == 'eco'].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e10b5de-de44-40b9-8df5-3f2f6dcf19c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sentence_transformers import SentenceTransformer, util\n",
    "import numpy as np\n",
    "\n",
    "model = SentenceTransformer(\"sentence-transformers/paraphrase-MiniLM-L6-v2\")\n",
    "\n",
    "autho_list = authoritarianism_final['Feature'].tolist()\n",
    "autho_emb = model.encode(autho_list, convert_to_tensor=True)\n",
    "\n",
    "# For every keyword, collect the other keywords of the same list whose cosine\n",
    "# similarity with it is above 0.7.\n",
    "autho_expansion_bert = {}\n",
    "for pos, keyword in enumerate(autho_list):\n",
    "    scores = util.cos_sim(autho_emb[pos], autho_emb)[0]\n",
    "    close = (scores > 0.7).cpu().numpy().nonzero()[0]\n",
    "    autho_expansion_bert[keyword] = {autho_list[k] for k in close if k != pos}\n",
    "\n",
    "# Keywords plus all of their neighbours. The neighbours are drawn from\n",
    "# autho_list itself, so this set contains exactly the distinct keywords.\n",
    "autho_set = set(autho_expansion_bert)\n",
    "for neighbours in autho_expansion_bert.values():\n",
    "    autho_set = autho_set | neighbours"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed7ece0d-e029-45e1-b1ee-1d9b63f2ea3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def expand_keywords(words, threshold=0.7):\n",
    "    \"\"\"Group the keywords of one theme by embedding similarity.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    words : list of str\n",
    "        Keywords to embed with the module-level sentence-transformer model.\n",
    "    threshold : float\n",
    "        A pair of keywords is linked when its cosine similarity is strictly\n",
    "        greater than this value.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple\n",
    "        (embeddings, expansion, word_set): the encoded tensor, a dict mapping\n",
    "        each keyword to the set of its linked keywords, and the union of all\n",
    "        keywords and linked keywords. For an empty list the result is\n",
    "        (None, {}, set()) and the model is not called.\n",
    "    \"\"\"\n",
    "    if not words:\n",
    "        return None, {}, set()\n",
    "    emb = model.encode(words, convert_to_tensor=True)\n",
    "    expansion = {}\n",
    "    for i, word in enumerate(words):\n",
    "        sim = util.cos_sim(emb[i], emb)[0]\n",
    "        idx = (sim > threshold).cpu().numpy().nonzero()[0]\n",
    "        expansion[word] = {words[j] for j in idx if j != i}  # skip the word itself\n",
    "    word_set = set(expansion.keys()) | set().union(*expansion.values())\n",
    "    return emb, expansion, word_set\n",
    "\n",
    "\n",
    "# The same expansion was previously pasted once per theme; one helper keeps the\n",
    "# three themes in step with each other.\n",
    "morality_list = morality_final['Feature'].tolist()\n",
    "morality_emb, morality_expansion_bert, morality_set = expand_keywords(morality_list)\n",
    "\n",
    "life_list = life_final['Feature'].tolist()\n",
    "life_emb, life_expansion_bert, life_set = expand_keywords(life_list)\n",
    "\n",
    "eco_list = eco_final['Feature'].tolist()\n",
    "eco_emb, eco_expansion_bert, eco_set = expand_keywords(eco_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a5bac8a-6887-4050-804f-cf9938685bc4",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "import pandas as pd\n",
    "import string\n",
    "from collections import Counter\n",
    "from itertools import product\n",
    "\n",
    "# preprocess\n",
    "def clean_text(text):\n",
    "    \"\"\"Lower-case *text* and delete every character listed in string.punctuation.\n",
    "\n",
    "    Characters are removed, not replaced by a space, so the pieces of a\n",
    "    hyphenated or underscored word are joined together.\n",
    "    \"\"\"\n",
    "    drop_punctuation = str.maketrans('', '', string.punctuation)\n",
    "    lowered = text.lower()\n",
    "    return lowered.translate(drop_punctuation)\n",
    "\n",
    "# Normalised text and its whitespace-separated tokens, stored as new columns.\n",
    "corpus['clean_text'] = corpus['text'].astype(str).map(clean_text)\n",
    "corpus['tokens'] = corpus['clean_text'].map(str.split)\n",
    "\n",
    "# define keywords set\n",
    "topic_sets = dict(\n",
    "    autho=autho_set,\n",
    "    morality=morality_set,\n",
    "    life=life_set,\n",
    "    eco=eco_set,\n",
    ")\n",
    "\n",
    "# coexist function\n",
    "def compute_cooccurrence(set1, set2, tokens_list):\n",
    "    \"\"\"Count in how many documents a word of set1 and a word of set2 both occur.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    set1, set2 : set of str\n",
    "        Keyword sets; they become the row and column labels (sorted).\n",
    "    tokens_list : iterable of iterables of str\n",
    "        One token sequence per document. Repeated tokens inside a document\n",
    "        count once because each document is reduced to a set.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Integer matrix indexed by sorted(set1) with columns sorted(set2).\n",
    "    \"\"\"\n",
    "    row_labels, col_labels = sorted(set1), sorted(set2)\n",
    "    row_pos = {word: i for i, word in enumerate(row_labels)}\n",
    "    col_pos = {word: j for j, word in enumerate(col_labels)}\n",
    "    # Accumulate in a plain array: incrementing DataFrame cells one at a time\n",
    "    # through .loc is far slower and gives the same counts.\n",
    "    counts = np.zeros((len(row_labels), len(col_labels)), dtype=np.int64)\n",
    "\n",
    "    # tqdm shows progress over the documents\n",
    "    for tokens in tqdm(tokens_list, desc=\"Calculating co-occurrence\"):\n",
    "        tokens_set = set(tokens)\n",
    "        rows = [row_pos[word] for word in tokens_set & set1]\n",
    "        cols = [col_pos[word] for word in tokens_set & set2]\n",
    "        if rows and cols:\n",
    "            # rows and cols hold unique positions, so every pair gets +1 once\n",
    "            counts[np.ix_(rows, cols)] += 1\n",
    "\n",
    "    return pd.DataFrame(counts, index=row_labels, columns=col_labels)\n",
    "\n",
    "\n",
    "co_matrix_autho_moral = compute_cooccurrence(autho_set, morality_set, corpus['tokens'])\n",
    "all_words = list(autho_set | morality_set | life_set | eco_set)\n",
    "# Every corpus token is tested for membership; a set makes that a constant-time\n",
    "# lookup where the list had to be scanned for each token.\n",
    "all_words_lookup = set(all_words)\n",
    "# Corpus-wide occurrence count of each keyword (all four themes together).\n",
    "word_freq = Counter(\n",
    "    word\n",
    "    for tokens in corpus['tokens']\n",
    "    for word in tokens\n",
    "    if word in all_words_lookup\n",
    ")\n",
    "def build_freq_table(set1, set2):\n",
    "    \"\"\"Build a node table for the words of two keyword sets.\n",
    "\n",
    "    Returns a DataFrame with one row per word of set1 | set2 and the columns\n",
    "    'id' (the word), 'weight' (its count in the module-level word_freq, 0 when\n",
    "    absent) and 'label' (0 for words of set1, 1 for all other words).\n",
    "    \"\"\"\n",
    "    def _weight(word):\n",
    "        return word_freq.get(word, 0)\n",
    "\n",
    "    def _label(word):\n",
    "        return 0 if word in set1 else 1\n",
    "\n",
    "    table = pd.DataFrame({'id': list(set1 | set2)})\n",
    "    table['weight'] = table['id'].map(_weight)\n",
    "    table['label'] = table['id'].map(_label)\n",
    "    return table\n",
    "\n",
    "freq_autho_moral = build_freq_table(autho_set, morality_set)\n",
    "\n",
    "# Export the co-occurrence matrix and the node table as Excel workbooks.\n",
    "for workbook_table, workbook_name in (\n",
    "    (co_matrix_autho_moral, \"autho_moral.xlsx\"),\n",
    "    (freq_autho_moral, \"autho_moral_freq.xlsx\"),\n",
    "):\n",
    "    workbook_table.to_excel(workbook_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef1c776a-987d-4dd2-9d92-fcad5282f1eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "co_matrix_autho_life = compute_cooccurrence(autho_set, life_set, corpus['tokens'])\n",
    "freq_autho_life = build_freq_table(autho_set, life_set)\n",
    "\n",
    "# The previous path literal had a single backslash directly before 'autho',\n",
    "# which Python reads as the BEL control character, so the workbook name was\n",
    "# corrupted. Forward slashes are valid on every platform, and the output\n",
    "# folder is created first so the export does not fail on a fresh checkout.\n",
    "os.makedirs('coexist', exist_ok=True)\n",
    "co_matrix_autho_life.to_excel('coexist/autho_life.xlsx')\n",
    "freq_autho_life.to_excel('coexist/autho_life_freq.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "047c7e25-b5e7-45de-87b5-c186ff987afb",
   "metadata": {},
   "outputs": [],
   "source": [
    "co_matrix_autho_eco = compute_cooccurrence(autho_set, eco_set, corpus['tokens'])\n",
    "freq_autho_eco = build_freq_table(autho_set, eco_set)\n",
    "\n",
    "# The previous path literal had a single backslash directly before 'autho',\n",
    "# which Python reads as the BEL control character, so the workbook name was\n",
    "# corrupted. Forward slashes are valid on every platform, and the output\n",
    "# folder is created first so the export does not fail on a fresh checkout.\n",
    "os.makedirs('coexist', exist_ok=True)\n",
    "co_matrix_autho_eco.to_excel('coexist/autho_eco.xlsx')\n",
    "freq_autho_eco.to_excel('coexist/autho_eco_freq.xlsx')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "29a758ff-1c27-4684-93ff-f39db635e9cb",
   "metadata": {},
   "source": [
    "## PMI calculation (taking Morality as an example)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8e25f92-c622-48d9-a988-f40d8323adc0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_freq_table_single(word_set):\n",
    "    \"\"\"Tabulate the corpus frequency of every word in *word_set*.\n",
    "\n",
    "    Returns a DataFrame with the columns 'id' (the word) and 'weight' (its\n",
    "    count in the module-level word_freq, 0 when the word is absent).\n",
    "    \"\"\"\n",
    "    table = pd.DataFrame({'id': list(word_set)})\n",
    "    table['weight'] = table['id'].map(lambda word: word_freq.get(word, 0))\n",
    "    return table\n",
    "\n",
    "# Total number of whitespace-separated tokens in the raw corpus text.\n",
    "N = sum(\n",
    "    len(document.split())\n",
    "    for document in tqdm(corpus['text'].astype(str), desc=\"Counting tokens\")\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af310883-1b8b-4ff0-8058-d8dc1ef5df21",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import networkx as nx\n",
    "\n",
    "def _frequency_for(freq_table, labels):\n",
    "    \"\"\"Return the 'weight' of each label, looked up by word.\n",
    "\n",
    "    Frequency tables keep the word in the 'id' column and have a 0..n-1 index.\n",
    "    Reindexing the bare 'weight' column with word labels therefore matches\n",
    "    nothing and yields only the fill value, so the table is indexed by word\n",
    "    first. Missing and zero frequencies are raised to 1 to keep later\n",
    "    divisions finite.\n",
    "    \"\"\"\n",
    "    by_word = freq_table.set_index('id')['weight']\n",
    "    return by_word.reindex(labels).fillna(1).clip(lower=1)\n",
    "\n",
    "\n",
    "def compute_pmi(co_matrix, freq_rows, freq_cols, n_tokens):\n",
    "    \"\"\"Return the add-1 smoothed PMI matrix of a co-occurrence table.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    co_matrix : pandas.DataFrame\n",
    "        Co-occurrence counts; rows and columns are labelled by word.\n",
    "    freq_rows, freq_cols : pandas.DataFrame\n",
    "        Frequency tables ('id', 'weight') for the row and column words.\n",
    "    n_tokens : int\n",
    "        Corpus size used to scale the expected counts.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        log2((count + 1) / (freq_row * freq_col / n_tokens)), labelled like\n",
    "        co_matrix.\n",
    "    \"\"\"\n",
    "    row_freq = _frequency_for(freq_rows, co_matrix.index)\n",
    "    col_freq = _frequency_for(freq_cols, co_matrix.columns)\n",
    "    expected = np.outer(row_freq, col_freq) / n_tokens\n",
    "    return np.log2((co_matrix + 1) / expected)  # +1: add-1 smoothing\n",
    "\n",
    "\n",
    "def connectivity_ranking(co_matrix, freq_rows, freq_cols, row_theme, col_theme):\n",
    "    \"\"\"Rank the words of both themes by their summed co-occurrence counts.\n",
    "\n",
    "    Returns a DataFrame with the columns 'Keyword', 'WeightedConnectivity'\n",
    "    (row or column sum of co_matrix), 'ConnectivityEfficiency' and 'Theme',\n",
    "    sorted by descending 'WeightedConnectivity'.\n",
    "    \"\"\"\n",
    "    parts = []\n",
    "    for sums, freq_table, theme in (\n",
    "        (co_matrix.sum(axis=1), freq_rows, row_theme),\n",
    "        (co_matrix.sum(axis=0), freq_cols, col_theme),\n",
    "    ):\n",
    "        sums = sums.sort_values(ascending=False)\n",
    "        occurrences = _frequency_for(freq_table, sums.index)\n",
    "        parts.append(pd.DataFrame({\n",
    "            'Keyword': sums.index,\n",
    "            'WeightedConnectivity': sums.values,\n",
    "            # NOTE(review): the efficiency series used here before was never\n",
    "            # defined in this notebook. It is taken to be connectivity per\n",
    "            # corpus occurrence of the keyword - confirm the intended formula.\n",
    "            'ConnectivityEfficiency': (sums / occurrences).values,\n",
    "            'Theme': theme,\n",
    "        }))\n",
    "    ranking = pd.concat(parts, axis=0, ignore_index=True)\n",
    "    return ranking.sort_values('WeightedConnectivity', ascending=False)\n",
    "\n",
    "\n",
    "def build_cooccurrence_graph(co_matrix):\n",
    "    \"\"\"Return a bipartite graph with an edge for every above-average count.\n",
    "\n",
    "    Row words get bipartite=0, column words bipartite=1; an edge carries the\n",
    "    count as 'weight' when it is strictly greater than the matrix mean.\n",
    "    \"\"\"\n",
    "    graph = nx.Graph()\n",
    "    graph.add_nodes_from(co_matrix.index, bipartite=0)\n",
    "    graph.add_nodes_from(co_matrix.columns, bipartite=1)\n",
    "    counts = co_matrix.to_numpy()\n",
    "    threshold = counts.mean()\n",
    "    for r, c in zip(*np.nonzero(counts > threshold)):\n",
    "        graph.add_edge(co_matrix.index[r], co_matrix.columns[c], weight=counts[r, c])\n",
    "    return graph\n",
    "\n",
    "\n",
    "freq_autho = build_freq_table_single(autho_set)\n",
    "\n",
    "# One entry per comparison theme: (keyword set, co-occurrence matrix, label).\n",
    "# The plotting cells read pmi_moral.csv, pmi_life.csv and pmi_eco.csv, so all\n",
    "# three themes are processed here rather than only the one wired in by hand.\n",
    "theme_inputs = {\n",
    "    'moral': (morality_set, co_matrix_autho_moral, 'Moral'),\n",
    "    'life': (life_set, co_matrix_autho_life, 'Life'),\n",
    "    'eco': (eco_set, co_matrix_autho_eco, 'Eco'),\n",
    "}\n",
    "\n",
    "os.makedirs('coexist', exist_ok=True)\n",
    "pmi_by_theme, ranking_by_theme, graph_by_theme = {}, {}, {}\n",
    "for theme_key, (theme_words, theme_matrix, theme_label) in theme_inputs.items():\n",
    "    freq_theme = build_freq_table_single(theme_words)\n",
    "    pmi_by_theme[theme_key] = compute_pmi(theme_matrix, freq_autho, freq_theme, N)\n",
    "    ranking_by_theme[theme_key] = connectivity_ranking(\n",
    "        theme_matrix, freq_autho, freq_theme, 'Autho', theme_label\n",
    "    )\n",
    "    graph_by_theme[theme_key] = build_cooccurrence_graph(theme_matrix)\n",
    "    # Forward slashes keep the paths identical on every platform and match the\n",
    "    # paths the plotting cells read from.\n",
    "    pmi_by_theme[theme_key].to_csv(f'coexist/pmi_{theme_key}.csv')\n",
    "    ranking_by_theme[theme_key].to_csv(f'coexist/wconnectivity_{theme_key}.csv', index=False)\n",
    "\n",
    "# NOTE(review): the earlier cosine-similarity line relied on two contribution\n",
    "# vectors that were never defined and was not used afterwards; it has been\n",
    "# left out. Re-add it once those vectors are specified.\n",
    "\n",
    "# Names kept for later cells; as before they hold the eco results.\n",
    "PMI = pmi_by_theme['eco']\n",
    "result_sorted = ranking_by_theme['eco']\n",
    "G = graph_by_theme['eco']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7050c530-5af6-49d6-a0cc-09ecb99632bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the PMI matrix built in the previous cell (the one exported to pmi_eco.csv).\n",
    "PMI"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f76e3d7f-0c32-47b9-98b6-d1b2adbef716",
   "metadata": {},
   "source": [
    "## Figure 3 plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0b103568-87e4-4371-b308-f933844360b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "# Authoritarianism keywords ordered by descending corpus frequency.\n",
    "# NOTE(review): despite its name, top20_ids is not cut to 20 entries.\n",
    "ranked_autho = freq_autho.sort_values(by='weight', ascending=False)\n",
    "top20_ids = ranked_autho['id'].tolist()\n",
    "\n",
    "\n",
    "def load_and_filter(path, ids):\n",
    "    \"\"\"Read a CSV indexed by its first column and return the rows for *ids*.\n",
    "\n",
    "    The result has exactly one row per entry of ids, in that order; ids that\n",
    "    are absent from the file produce rows filled with NaN.\n",
    "    \"\"\"\n",
    "    table = pd.read_csv(path, index_col=0)\n",
    "    present = table.index.intersection(ids)\n",
    "    return table.loc[present].reindex(ids)\n",
    "\n",
    "\n",
    "pmimoral, pmilife, pmieco = (\n",
    "    load_and_filter(csv_path, top20_ids)\n",
    "    for csv_path in (\n",
    "        \"coexist/pmi_moral.csv\",\n",
    "        \"coexist/pmi_life.csv\",\n",
    "        \"coexist/pmi_eco.csv\",\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0edf95a9-2ce5-47a4-a5ee-2196a12f33cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib import font_manager as fm\n",
    "from scipy.stats import f_oneway\n",
    "\n",
    "# Order the keywords by descending frequency and cut the list after entry 44:\n",
    "# the first part goes to the left panel, the remainder to the right panel.\n",
    "SPLIT_AT = 44\n",
    "freq_autho = freq_autho.sort_values(by='weight', ascending=False)\n",
    "ranked_ids = freq_autho['id']\n",
    "first_half_ids = ranked_ids.iloc[:SPLIT_AT].tolist()\n",
    "second_half_ids = ranked_ids.iloc[SPLIT_AT:].tolist()\n",
    "\n",
    "def load_and_filter(path, ids):\n",
    "    \"\"\"Read a CSV indexed by its first column and return the rows for *ids*.\n",
    "\n",
    "    The result has exactly one row per entry of ids, in that order; ids that\n",
    "    are absent from the file produce rows filled with NaN.\n",
    "    \"\"\"\n",
    "    table = pd.read_csv(path, index_col=0)\n",
    "    present = table.index.intersection(ids)\n",
    "    return table.loc[present].reindex(ids)\n",
    "\n",
    "# PMI tables of the three themes, rows aligned with the ranked keyword list.\n",
    "ranked_keyword_ids = freq_autho['id'].tolist()\n",
    "pmimoral = load_and_filter(\"coexist/pmi_moral.csv\", ranked_keyword_ids)\n",
    "pmilife = load_and_filter(\"coexist/pmi_life.csv\", ranked_keyword_ids)\n",
    "pmieco = load_and_filter(\"coexist/pmi_eco.csv\", ranked_keyword_ids)\n",
    "\n",
    "# Mean and 95%CI\n",
    "def calc_mean_ci(df):\n",
    "    \"\"\"Return the row-wise mean and the half-width of its 95% interval.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Numeric table; each row is summarised across its columns.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    (pandas.Series, pandas.Series)\n",
    "        mean and 1.96 * standard error, both indexed like df.\n",
    "    \"\"\"\n",
    "    mean = df.mean(axis=1)\n",
    "    # mean() and std() skip NaN, so the standard error has to divide by the\n",
    "    # number of values actually present in the row, not by the column count.\n",
    "    n_obs = df.count(axis=1)\n",
    "    se = df.std(axis=1) / np.sqrt(n_obs)\n",
    "    ci = 1.96 * se\n",
    "    return mean, ci\n",
    "\n",
    "# Per-keyword mean PMI and interval half-width for each theme.\n",
    "(mean_moral, ci_moral), (mean_life, ci_life), (mean_eco, ci_eco) = (\n",
    "    calc_mean_ci(pmi_table) for pmi_table in (pmimoral, pmilife, pmieco)\n",
    ")\n",
    "\n",
    "# single plot function\n",
    "def plot_pmi(ids_subset, ax, title):\n",
    "    \"\"\"Draw the mean PMI of each keyword, with error bars, for the three themes.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    ids_subset : list of str\n",
    "        Keywords to plot; they are placed on the y axis in reversed order.\n",
    "    ax : matplotlib axes object\n",
    "        Target axes.\n",
    "    title : str\n",
    "        Title of the axes.\n",
    "\n",
    "    The function reads the module-level mean_*/ci_* series and the pmimoral,\n",
    "    pmilife and pmieco tables. A one-way ANOVA across the three PMI rows of a\n",
    "    keyword decides the marker printed right of its error bars:\n",
    "    '***' for p < 0.01, '**' for p < 0.05, '*' for p < 0.1.\n",
    "    \"\"\"\n",
    "    def _marker(p_val):\n",
    "        if p_val < 0.01:\n",
    "            return '***'\n",
    "        if p_val < 0.05:\n",
    "            return '**'\n",
    "        if p_val < 0.1:\n",
    "            return '*'\n",
    "        return ''\n",
    "\n",
    "    words = ids_subset[::-1]\n",
    "    y = np.arange(len(words)) * 2  # two units per keyword leave room for three markers\n",
    "    offset = 0.3\n",
    "\n",
    "    m, ci_m = mean_moral.loc[words], ci_moral.loc[words]\n",
    "    l, ci_l = mean_life.loc[words], ci_life.loc[words]\n",
    "    e, ci_e = mean_eco.loc[words], ci_eco.loc[words]\n",
    "\n",
    "    # ANOVA & sig.\n",
    "    stars = []\n",
    "    for word in words:\n",
    "        _, p_val = f_oneway(\n",
    "            pmimoral.loc[word].values,\n",
    "            pmilife.loc[word].values,\n",
    "            pmieco.loc[word].values,\n",
    "        )\n",
    "        stars.append(_marker(p_val))\n",
    "\n",
    "    # The three themes are drawn slightly apart so their bars do not overlap.\n",
    "    ax.errorbar(m, y - offset, xerr=ci_m, fmt='o', capsize=4, label='PMI Moral')\n",
    "    ax.errorbar(l, y, xerr=ci_l, fmt='s', capsize=4, label='PMI Life')\n",
    "    ax.errorbar(e, y + offset, xerr=ci_e, fmt='^', capsize=4, label='PMI Eco')\n",
    "\n",
    "    # Print the marker just right of the widest interval of the keyword.\n",
    "    for pos, (word, mark) in enumerate(zip(words, stars)):\n",
    "        if not mark:\n",
    "            continue\n",
    "        right_edge = max(\n",
    "            m.iloc[pos] + ci_m[word],\n",
    "            l.iloc[pos] + ci_l[word],\n",
    "            e.iloc[pos] + ci_e[word],\n",
    "        )\n",
    "        ax.text(right_edge + 0.01, y[pos], mark, va='center', fontsize=10)\n",
    "\n",
    "    ax.set_yticks(y)\n",
    "    ax.set_yticklabels(words, fontsize=10)\n",
    "    ax.set_xlabel(\"Mean PMI (±95% CI)\", fontsize=12)\n",
    "    ax.set_title(title, fontsize=14)\n",
    "    ax.grid(axis='x', linestyle='--', alpha=0.4)\n",
    "\n",
    "# Draw both keyword groups side by side in one figure.\n",
    "font_path = r\"C:/Windows/Fonts/times.ttf\"\n",
    "# The font file only exists on Windows. Register it when it is there and fall\n",
    "# back to the generic serif family otherwise, so the cell also runs elsewhere.\n",
    "if os.path.exists(font_path):\n",
    "    fm.fontManager.addfont(font_path)\n",
    "    plt.rcParams['font.family'] = 'Times New Roman'\n",
    "else:\n",
    "    plt.rcParams['font.family'] = 'serif'\n",
    "\n",
    "fig, axes = plt.subplots(1, 2, figsize=(20, 15), sharex=True)\n",
    "# The counts in the titles follow the actual group sizes.\n",
    "plot_pmi(first_half_ids, axes[0], f\"Top {len(first_half_ids)} Keywords (High Weight)\")\n",
    "plot_pmi(second_half_ids, axes[1], f\"Bottom {len(second_half_ids)} Keywords (Lower Weight)\")\n",
    "for panel in axes:\n",
    "    panel.legend(fontsize=10)\n",
    "plt.tight_layout()\n",
    "\n",
    "# A forward slash keeps the path valid on every platform; a backslash in a\n",
    "# normal string literal starts an escape sequence.\n",
    "os.makedirs(\"coexist\", exist_ok=True)\n",
    "plt.savefig(\"coexist/keywords_pmi_comparison.png\", dpi=600, bbox_inches='tight')\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
